In [1]:
import statsmodels.api as sm 
import pandas as pd
import statsmodels.formula.api as smf
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.linear_model import LinearRegression
import numpy as np
In [2]:
df=pd.read_csv('/Users/maha/Downloads/wages_by_education.csv')
In [3]:
df.head()
Out[3]:
year less_than_hs high_school some_college bachelors_degree advanced_degree men_less_than_hs men_high_school men_some_college men_bachelors_degree ... black_women_less_than_hs black_women_high_school black_women_some_college black_women_bachelors_degree black_women_advanced_degree hispanic_women_less_than_hs hispanic_women_high_school hispanic_women_some_college hispanic_women_bachelors_degree hispanic_women_advanced_degree
0 2022 16.52 21.94 24.81 41.60 53.22 17.99 24.08 27.96 49.01 ... 13.89 17.83 20.36 29.94 39.41 14.74 18.18 20.64 31.13 40.64
1 2021 16.74 22.28 24.92 41.32 53.45 18.34 24.36 27.96 47.83 ... 13.73 18.30 20.18 29.35 40.07 14.97 18.34 21.14 31.25 42.47
2 2020 17.02 22.70 25.44 41.65 53.74 18.76 25.09 28.55 48.15 ... 13.66 17.93 20.31 31.38 42.44 14.58 18.50 20.69 31.55 44.15
3 2019 16.11 21.64 24.00 39.61 51.57 17.55 23.99 26.99 45.74 ... 13.30 17.18 19.17 30.31 40.42 14.50 17.71 19.69 30.18 42.30
4 2018 15.94 21.50 23.70 38.87 51.03 17.70 23.72 26.61 44.97 ... 12.48 16.81 18.57 30.85 39.64 13.47 17.28 19.29 29.47 39.35

5 rows × 61 columns

In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 61 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   year                             50 non-null     int64  
 1   less_than_hs                     50 non-null     float64
 2   high_school                      50 non-null     float64
 3   some_college                     50 non-null     float64
 4   bachelors_degree                 50 non-null     float64
 5   advanced_degree                  50 non-null     float64
 6   men_less_than_hs                 50 non-null     float64
 7   men_high_school                  50 non-null     float64
 8   men_some_college                 50 non-null     float64
 9   men_bachelors_degree             50 non-null     float64
 10  men_advanced_degree              50 non-null     float64
 11  women_less_than_hs               50 non-null     float64
 12  women_high_school                50 non-null     float64
 13  women_some_college               50 non-null     float64
 14  women_bachelors_degree           50 non-null     float64
 15  women_advanced_degree            50 non-null     float64
 16  white_less_than_hs               50 non-null     float64
 17  white_high_school                50 non-null     float64
 18  white_some_college               50 non-null     float64
 19  white_bachelors_degree           50 non-null     float64
 20  white_advanced_degree            50 non-null     float64
 21  black_less_than_hs               50 non-null     float64
 22  black_high_school                50 non-null     float64
 23  black_some_college               50 non-null     float64
 24  black_bachelors_degree           50 non-null     float64
 25  black_advanced_degree            50 non-null     float64
 26  hispanic_less_than_hs            50 non-null     float64
 27  hispanic_high_school             50 non-null     float64
 28  hispanic_some_college            50 non-null     float64
 29  hispanic_bachelors_degree        50 non-null     float64
 30  hispanic_advanced_degree         50 non-null     float64
 31  white_men_less_than_hs           50 non-null     float64
 32  white_men_high_school            50 non-null     float64
 33  white_men_some_college           50 non-null     float64
 34  white_men_bachelors_degree       50 non-null     float64
 35  white_men_advanced_degree        50 non-null     float64
 36  black_men_less_than_hs           50 non-null     float64
 37  black_men_high_school            50 non-null     float64
 38  black_men_some_college           50 non-null     float64
 39  black_men_bachelors_degree       50 non-null     float64
 40  black_men_advanced_degree        50 non-null     float64
 41  hispanic_men_less_than_hs        50 non-null     float64
 42  hispanic_men_high_school         50 non-null     float64
 43  hispanic_men_some_college        50 non-null     float64
 44  hispanic_men_bachelors_degree    50 non-null     float64
 45  hispanic_men_advanced_degree     50 non-null     float64
 46  white_women_less_than_hs         50 non-null     float64
 47  white_women_high_school          50 non-null     float64
 48  white_women_some_college         50 non-null     float64
 49  white_women_bachelors_degree     50 non-null     float64
 50  white_women_advanced_degree      50 non-null     float64
 51  black_women_less_than_hs         50 non-null     float64
 52  black_women_high_school          50 non-null     float64
 53  black_women_some_college         50 non-null     float64
 54  black_women_bachelors_degree     50 non-null     float64
 55  black_women_advanced_degree      50 non-null     float64
 56  hispanic_women_less_than_hs      50 non-null     float64
 57  hispanic_women_high_school       50 non-null     float64
 58  hispanic_women_some_college      50 non-null     float64
 59  hispanic_women_bachelors_degree  50 non-null     float64
 60  hispanic_women_advanced_degree   50 non-null     float64
dtypes: float64(60), int64(1)
memory usage: 24.0 KB
In [5]:
fig = px.line(df, x='year', y=['less_than_hs', 'high_school', 'some_college', 'bachelors_degree', 'advanced_degree'],
              labels={'value': 'Wage', 'variable': 'Education Level'},
              title='Wages by Education Level in the USA (1973-2022)')
In [6]:
fig.update_layout(
    xaxis_title='Year',
    yaxis_title='Wage',
    legend_title='Education Level',
    hovermode='x'
)

fig.show()

Figure #1¶

In [7]:
selected_year = [2020,2021,2022]
In [8]:
filtered_df = df[df['year'].isin(selected_year)]
filtered_df
Out[8]:
year less_than_hs high_school some_college bachelors_degree advanced_degree men_less_than_hs men_high_school men_some_college men_bachelors_degree ... black_women_less_than_hs black_women_high_school black_women_some_college black_women_bachelors_degree black_women_advanced_degree hispanic_women_less_than_hs hispanic_women_high_school hispanic_women_some_college hispanic_women_bachelors_degree hispanic_women_advanced_degree
0 2022 16.52 21.94 24.81 41.60 53.22 17.99 24.08 27.96 49.01 ... 13.89 17.83 20.36 29.94 39.41 14.74 18.18 20.64 31.13 40.64
1 2021 16.74 22.28 24.92 41.32 53.45 18.34 24.36 27.96 47.83 ... 13.73 18.30 20.18 29.35 40.07 14.97 18.34 21.14 31.25 42.47
2 2020 17.02 22.70 25.44 41.65 53.74 18.76 25.09 28.55 48.15 ... 13.66 17.93 20.31 31.38 42.44 14.58 18.50 20.69 31.55 44.15

3 rows × 61 columns

In [9]:
filtered_df.plot(x='year', y=['less_than_hs','high_school','some_college','bachelors_degree','advanced_degree'], kind='bar', legend=True)
plt.title('Bar Chart Example')
plt.xlabel('Years')
plt.ylabel('Values')
plt.show()

Figure #2¶

In [10]:
from pandasql import sqldf
In [11]:
df = pd.read_csv('/Users/maha/Downloads/wages_by_education.csv')
In [12]:
pysqldf = sqldf("SELECT * FROM df", globals())
In [13]:
sqldf("SELECT * FROM df")
Out[13]:
year less_than_hs high_school some_college bachelors_degree advanced_degree men_less_than_hs men_high_school men_some_college men_bachelors_degree ... black_women_less_than_hs black_women_high_school black_women_some_college black_women_bachelors_degree black_women_advanced_degree hispanic_women_less_than_hs hispanic_women_high_school hispanic_women_some_college hispanic_women_bachelors_degree hispanic_women_advanced_degree
0 2022 16.52 21.94 24.81 41.60 53.22 17.99 24.08 27.96 49.01 ... 13.89 17.83 20.36 29.94 39.41 14.74 18.18 20.64 31.13 40.64
1 2021 16.74 22.28 24.92 41.32 53.45 18.34 24.36 27.96 47.83 ... 13.73 18.30 20.18 29.35 40.07 14.97 18.34 21.14 31.25 42.47
2 2020 17.02 22.70 25.44 41.65 53.74 18.76 25.09 28.55 48.15 ... 13.66 17.93 20.31 31.38 42.44 14.58 18.50 20.69 31.55 44.15
3 2019 16.11 21.64 24.00 39.61 51.57 17.55 23.99 26.99 45.74 ... 13.30 17.18 19.17 30.31 40.42 14.50 17.71 19.69 30.18 42.30
4 2018 15.94 21.50 23.70 38.87 51.03 17.70 23.72 26.61 44.97 ... 12.48 16.81 18.57 30.85 39.64 13.47 17.28 19.29 29.47 39.35
5 2017 15.92 21.26 23.31 38.65 49.40 17.63 23.47 25.91 44.50 ... 12.99 16.33 18.76 29.19 38.26 13.36 17.02 19.60 29.69 38.43
6 2016 15.45 21.00 23.35 38.73 49.64 16.95 23.01 25.98 44.98 ... 12.42 16.71 18.88 29.56 38.55 13.40 17.11 19.26 29.41 37.77
7 2015 15.25 20.81 23.17 37.96 48.42 16.63 22.93 25.99 43.47 ... 12.79 16.21 18.50 30.42 37.37 13.42 16.99 18.66 29.71 36.37
8 2014 14.66 20.34 22.48 36.47 47.27 15.95 22.40 24.96 41.14 ... 12.47 16.09 18.39 30.20 37.88 12.61 16.55 18.22 28.53 37.16
9 2013 14.62 20.31 22.50 37.00 48.54 15.86 22.28 25.07 42.34 ... 12.41 16.60 18.32 29.57 38.40 12.80 16.23 18.70 28.20 36.92
10 2012 14.80 20.50 22.56 36.76 48.47 16.15 22.65 25.05 42.25 ... 12.20 16.52 18.66 29.43 37.46 12.61 16.41 18.78 27.58 38.91
11 2011 14.96 20.67 22.90 36.44 47.56 16.16 22.81 25.34 41.41 ... 13.40 16.56 18.80 30.49 38.01 13.06 16.56 19.44 28.19 39.59
12 2010 15.04 21.06 23.56 37.42 48.80 16.32 23.28 26.24 42.71 ... 13.63 17.33 19.61 30.53 37.74 12.89 16.85 19.65 28.39 41.27
13 2009 15.41 21.42 23.82 37.30 49.12 16.91 23.74 26.75 43.04 ... 13.23 17.26 19.88 29.59 39.37 13.24 17.43 19.63 29.44 41.34
14 2008 15.08 20.93 23.52 37.04 47.55 16.56 23.30 26.32 42.34 ... 12.78 16.69 19.55 30.50 38.24 13.02 16.70 19.86 28.68 37.34
15 2007 15.19 21.04 23.84 37.23 47.31 16.60 23.39 26.66 42.55 ... 13.08 17.22 20.07 29.77 38.75 12.76 16.91 19.94 29.81 40.13
16 2006 14.90 21.17 23.69 36.99 47.23 16.31 23.55 26.39 41.97 ... 13.25 17.02 19.94 30.53 38.06 12.68 16.90 19.62 28.80 38.93
17 2005 14.96 21.07 23.76 36.89 47.09 16.46 23.34 26.49 41.96 ... 12.90 17.07 19.69 29.25 38.33 12.64 16.96 19.73 28.61 38.19
18 2004 15.13 21.33 24.11 36.78 47.48 16.66 23.74 27.03 41.74 ... 12.81 17.51 20.01 30.28 38.36 12.69 17.24 20.08 29.68 37.50
19 2003 15.27 21.50 24.19 37.14 46.79 16.79 23.89 27.04 42.18 ... 13.13 17.59 20.45 31.02 37.78 13.04 17.38 20.13 28.66 35.91
20 2002 15.18 21.39 24.25 37.26 47.36 16.70 23.92 27.10 42.36 ... 12.84 17.41 20.15 30.11 38.80 12.80 16.66 20.03 28.39 37.85
21 2001 14.88 21.10 24.05 37.07 46.23 16.23 23.67 27.12 42.14 ... 13.04 16.97 19.70 30.11 37.22 12.83 16.88 19.71 28.71 36.79
22 2000 14.67 20.80 23.68 36.44 45.99 16.13 23.48 26.78 41.41 ... 12.95 16.81 19.59 30.18 36.87 12.29 16.53 19.38 28.28 35.66
23 1999 14.61 20.67 23.36 35.76 46.02 16.16 23.31 26.34 40.71 ... 12.72 16.42 19.21 29.54 37.04 12.16 16.38 19.03 28.09 35.95
24 1998 14.57 20.39 23.02 34.97 44.49 16.04 22.99 25.86 39.63 ... 12.57 16.25 19.29 28.68 36.92 12.10 16.27 19.21 27.55 34.36
25 1997 14.07 19.95 22.54 33.44 43.68 15.44 22.58 25.33 37.84 ... 12.25 15.83 18.78 26.60 33.90 11.83 15.66 18.37 27.23 34.91
26 1996 13.95 19.62 22.11 32.50 43.35 15.45 22.11 24.79 36.80 ... 11.77 15.70 17.92 26.03 35.73 11.77 15.53 18.28 26.75 38.63
27 1995 13.95 19.68 22.04 32.86 43.27 15.39 22.20 24.78 37.07 ... 12.07 15.53 18.59 27.99 37.90 11.35 15.91 18.17 26.66 36.22
28 1994 14.33 19.88 22.26 32.72 43.46 15.72 22.39 24.89 36.89 ... 12.45 15.79 18.90 27.01 37.24 12.07 16.47 18.86 26.70 36.96
29 1993 14.60 19.65 22.27 32.16 41.45 16.09 22.18 24.84 36.37 ... 12.73 15.84 18.70 27.45 35.67 11.94 16.25 18.79 25.57 36.49
30 1992 14.87 19.65 22.26 32.16 40.94 16.42 22.32 24.95 36.38 ... 13.21 15.73 18.98 27.01 34.10 12.18 15.79 18.89 26.66 34.62
31 1991 14.99 19.77 22.49 31.87 40.71 16.66 22.49 25.46 36.36 ... 12.61 15.84 18.91 26.81 32.58 12.19 16.11 18.51 25.40 33.75
32 1990 15.05 19.74 22.58 32.21 39.95 16.90 22.69 25.75 36.77 ... 12.20 15.63 18.77 25.51 32.97 12.07 15.93 18.33 26.63 32.33
33 1989 15.46 20.07 22.61 31.89 39.80 17.63 23.31 25.78 36.60 ... 12.14 15.69 19.00 25.97 34.39 12.13 15.95 19.17 25.12 32.88
34 1988 15.79 20.66 23.03 33.41 41.12 18.02 24.18 26.71 39.34 ... 12.32 16.17 18.67 26.04 35.80 12.27 15.77 19.43 24.59 31.65
35 1987 15.76 20.60 23.17 33.19 40.55 18.05 24.17 27.02 39.07 ... 12.60 16.03 18.76 25.21 33.45 12.31 15.97 18.92 25.01 32.33
36 1986 16.10 20.61 23.12 32.67 39.88 18.44 24.24 27.04 38.52 ... 12.78 16.09 18.48 24.60 33.04 12.18 16.35 18.68 24.92 28.90
37 1985 16.08 20.40 22.60 31.50 38.31 18.46 24.14 26.25 37.00 ... 12.45 15.87 18.30 25.15 31.28 12.40 15.89 18.38 24.12 29.40
38 1984 16.21 20.30 22.37 31.07 37.45 18.66 24.06 25.97 36.63 ... 12.39 15.65 18.04 23.66 31.81 12.39 16.25 17.79 22.23 30.82
39 1983 16.43 20.43 22.28 30.73 36.78 18.89 24.33 26.04 36.21 ... 12.92 15.94 17.71 23.73 30.64 12.35 16.02 18.94 23.17 26.31
40 1982 16.73 20.65 22.43 30.55 36.08 19.34 24.64 26.22 35.80 ... 12.81 15.95 17.88 23.54 30.65 12.67 16.48 17.68 23.98 30.21
41 1981 17.07 20.68 22.47 30.17 35.32 19.64 24.84 26.24 35.41 ... 12.88 15.86 18.14 23.82 31.05 13.02 16.13 17.49 21.29 31.21
42 1980 17.42 20.90 22.73 30.04 35.53 20.08 25.18 26.63 35.16 ... 12.98 16.09 18.28 23.99 31.58 12.87 16.16 17.86 22.60 28.23
43 1979 17.95 21.62 23.20 30.57 36.19 20.74 26.04 27.18 35.72 ... 13.33 16.30 18.54 23.84 29.34 13.20 16.27 18.02 21.94 30.87
44 1978 17.29 21.52 23.25 30.90 37.56 20.10 25.97 27.14 35.92 ... 12.67 17.00 18.24 25.47 34.08 13.08 15.72 17.56 21.26 28.97
45 1977 17.59 21.50 22.97 31.07 37.36 20.43 25.97 26.70 36.09 ... 12.80 16.30 18.87 25.83 37.10 12.66 17.22 17.12 21.49 30.31
46 1976 17.52 21.76 23.49 31.46 37.50 20.36 26.14 27.10 36.42 ... 12.75 16.40 19.11 26.10 34.92 13.79 17.29 17.40 20.29 20.93
47 1975 17.30 21.55 23.30 31.45 38.41 20.00 26.02 26.93 36.21 ... 12.48 15.88 18.11 27.93 33.03 12.87 16.04 17.04 18.85 23.63
48 1974 17.68 21.60 23.32 31.69 38.37 20.63 26.15 26.79 36.62 ... 12.31 15.55 18.19 27.80 33.86 13.24 16.29 18.17 21.70 26.74
49 1973 18.06 22.22 24.08 32.80 38.16 21.18 26.90 27.67 37.69 ... 12.02 15.79 18.11 28.35 41.81 12.92 16.46 20.22 22.23 25.10

50 rows × 61 columns

In [14]:
comparison_result = (sqldf("SELECT year, AVG(men_high_school) AS men_high_school,AVG(women_high_school) AS women_high_school,AVG(men_advanced_degree) AS men_advanced_degree, AVG(women_advanced_degree) AS women_advanced_degree FROM df GROUP BY year"))
In [15]:
print("Overview of Data Manipulation:")

plt.figure(figsize=(12, 8))
plt.plot(comparison_result['year'], comparison_result['men_high_school'], label='Men - High School')
plt.plot(comparison_result['year'], comparison_result['men_advanced_degree'], label='Men - Advanced Degree')
plt.plot(comparison_result['year'], comparison_result['women_high_school'], label='Women - High School')
plt.plot(comparison_result['year'], comparison_result['women_advanced_degree'], label='Women - Advanced Degree')

plt.title('Comparison of Average Wages for High School-educated Men and Women Over the Years')
plt.xlabel('Year')
plt.ylabel('Average Wages')
plt.legend()
plt.show()
Overview of Data Manipulation: